LinkedIn

knitr::opts_chunk$set(echo = TRUE)

Introduction

This notebook explores a sample of LinkedIn connection data with the aim of finding, for each member, the friend of a friend with whom they share the most common connections. Several algorithms are tried and timed along the way.

library(tidyverse)
library(igraph)
library(tictoc)

members_df <-
  read_csv("./linkedin_data/common_connection_200k.csv")

members_sample_df <-
  members_df %>% sample_n(1000000) # sample of about 1e6 connection rows

members_small_sample_df <-
  members_df %>% sample_n(100000) # smaller sample of about 1e5 connection rows

members_sample_graph <- 
  members_sample_df %>% 
  graph_from_data_frame(directed = FALSE)

members_small_sample_graph <- 
  members_small_sample_df %>% 
  graph_from_data_frame(directed = FALSE)

Exploration

print("Number of unique vertices:")
vertex_attr(members_sample_graph)$name %>% unique() %>% length()

Playing around with igraph, we can recover the original edge list data frame from the graph:

edges_df <-
  as_edgelist(members_sample_graph) %>%  # get.edgelist() is deprecated
  as_tibble(.name_repair = ~ c("member", "member_conn"))

edges_df %>% arrange(member, member_conn)

Algorithm 1

Algorithm 1 (finds the member pair with the most common connections) in English:

  1. Get member.
  2. Get the member's connections.
  3. For each connection, get the list of that connection's connections.
  4. Check if the connection's connections are in the member's connections.
  5. If yes: increment and store a counter for the member / connection pair.
  6. Repeat 1-5 until all member pairs are accounted for.
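
As a rough sketch (purely illustrative; the function name is made up, and the member_id / connected_member_id columns are assumed from members_df above), steps 1-5 for a single member might look like:

# A minimal sketch of steps 1-5 for one member, in base R.
common_conns_with_each_connection <- function(members_df, member) {
  # Steps 1-2: the member's direct connections.
  member_conns <- members_df$connected_member_id[members_df$member_id == member]
  # Steps 3-5: for each direct connection, count how many of its
  # connections also appear among the original member's connections.
  sapply(member_conns, function(conn) {
    conn_conns <- members_df$connected_member_id[members_df$member_id == conn]
    sum(conn_conns %in% member_conns, na.rm = TRUE)
  })
}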

Number of unique member names vs total number of member names in sample:

edges_df$member %>% unique() %>% length()
edges_df$member %>% length()
rm(edges_df)

The following loop prints, for each connection of a given member, how many of that connection's connections are also connections of the member:

first_member <- 
  members_df %>% filter(member_id == "150503")

for(conn in first_member$connected_member_id) {
  conn_conns <- 
    members_df %>% filter(member_id == conn)

  # print(conn_conns)

  (conn_conns$connected_member_id %in%
    first_member$connected_member_id) %>% 
    sum(na.rm = TRUE) %>% print()
}

Algorithm 2

  1. Get original member.
  2. Get list of friends of member.
  3. For each member in the dataset (minus the original member and the direct friends of the original member), check if the friends of that member are in the list of friends of the original member.

tictoc::tic("Total")
tictoc::tic("Create Nested data.frame")

nested_sample_df <- 
  members_sample_df %>% 
  nest(data = connected_member_id) %>%  # one row per member_id
  mutate(connected_member_ids = map(data, "connected_member_id")) %>% 
  select(-data)
tictoc::toc()

first_member <- nested_sample_df %>% filter(member_id == "67890")
tictoc::tic("Calculate Number of Connections")

first_member_friends <- first_member$connected_member_ids[[1]]

# Remove the original member and their direct friends, then count, for each
# remaining member, how many of their connections are also connections of
# the original member.
sample_less_friends <-
  nested_sample_df %>% 
  filter(!(member_id %in% c(first_member$member_id, first_member_friends)))

sample_less_friends <- 
  sample_less_friends %>% 
  mutate(common_friends = map_int(connected_member_ids,
                                  function(x) sum(x %in% first_member_friends,
                                                  na.rm = TRUE)))

tictoc::toc()
tictoc::toc()
create_nested_member_dataframe <- function(member_data) {
  # Collapse the edge list to one row per member with a list-column of
  # that member's connections.
  member_data %>% 
    nest(data = connected_member_id) %>% 
    mutate(connected_member_ids = map(data, "connected_member_id")) %>% 
    select(-data)
}

nested_members_df <- create_nested_member_dataframe(members_df)
calculate_num_common_friends <- function(member_data, member_name){

  first_member <- member_data %>% filter(member_id == member_name)
  first_member_friends <- first_member$connected_member_ids[[1]]

  # Remove the original member and their direct friends, then count common
  # friends for each remaining member.
  sample_less_friends <-
    member_data %>% 
    filter(!(member_id %in% c(first_member$member_id, first_member_friends)))

  sample_less_friends %>% 
    mutate(common_friends = map_int(connected_member_ids,
                                    function(x) sum(x %in% first_member_friends,
                                                    na.rm = TRUE)))
}

tictoc::tic("Bad Algo 2")
calculate_num_common_friends(nested_members_df, "67890")
tictoc::toc()

Algorithm 3

Attempting to find friends of friends for a large sample (about 10^6 edges) took well over 2 hours using the igraph package to calculate the neighbourhoods. I suspect the computation time is worse than O(n), so it could take over a day to complete this calculation for the full dataset.

Worryingly, the setdiff between the degree-1 and degree-2 neighbourhoods returns the empty set, which at first seems very wrong: given we have sampled about 1/10 of the full dataset, it seems unlikely that we would find no friends of friends at all. On reflection, though, the empty set is expected: unlist() pools the neighbourhoods of every vertex together, and since each vertex belongs to its own order-1 neighbourhood, the pooled order-1 set already contains every vertex id in the graph. The difference needs to be taken per vertex instead, which is what the mindist argument of ego() does in Algorithm 4 below.
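
A toy example (a 5-vertex ring, purely illustrative) demonstrates this:

# Every vertex appears in its own order-1 neighbourhood, so once the
# neighbourhoods are pooled with unlist() the set difference is always empty.
toy_graph <- make_ring(5)
setdiff(unlist(neighborhood(toy_graph, 2)),
        unlist(neighborhood(toy_graph, 1))) # numeric(0): the pooled sets match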

tictoc::tic("Total")
tictoc::tic("1st Degree Neighbourhood")
neighbours_1_deg <- unlist(neighborhood(members_sample_graph, 1))
tictoc::toc()
tictoc::tic("2nd Degree Neighbourgood")
neighbours_2_deg <- unlist(neighborhood(members_sample_graph, 2))
tictoc::toc()
tictoc::tic("Friends of Friends")
second_deg_conns <- setdiff(neighbours_2_deg, neighbours_1_deg)
tictoc::toc()
tictoc::toc()

Algorithm 4

  1. Use igraph's built-in ego() function to calculate the friends of friends of each vertex (member).
  2. Convert to a data.frame of edges (connections) for memory efficiency.
  3. Remove empty and duplicate relationships.
  4. For each friend of friend of each vertex, compare their list of direct connections to the original vertex's connections and sum any matches.
  5. Sort by the count of matches, or just retrieve the maximum.

tic("Create friend of friend list")
f_of_f_small_sample <- ego(members_small_sample_graph, order = 2, mindist = 2, mode = "all")
toc()

# tic("Create friend of friend list")
# f_of_f_sample <- ego(members_sample_graph, order = 2, mindist = 2, mode = "all")
# toc()
tic("Create fof dataframe")
tic("Create nested df")
f_of_f_small_sample_df <- 
  tibble(member_id = V(members_small_sample_graph)$name,
         friend_of_friend = map(f_of_f_small_sample, function(x) x$name))
toc()

tic("Unnest df")
f_of_f_small_sample_df <- unnest(f_of_f_small_sample_df, friend_of_friend)
toc()

# Direction does not matter here.
tic("Remove duplicate relationships")
f_of_f_small_sample_df <-
  f_of_f_small_sample_df %>% 
  # Sort each pair and join with a separator: with collapse = "" the pairs
  # ("1", "23") and ("12", "3") would both become "123" and be wrongly
  # treated as duplicates.
  mutate(sorted_set = map2_chr(member_id, friend_of_friend, 
                               function(x, y) paste(sort(c(x, y)), collapse = "_"))) %>% 
  distinct(sorted_set, .keep_all = TRUE) %>% 
  select(-sorted_set) %>% 
  mutate(across(everything(), as.integer))

toc()
toc()

Now we have a unique row for each friend-of-a-friend connection. We can use this to look up the original member's direct connections and the friend of friend's direct connections, and count the number of matches. Key to this will be the speed of looking up the direct connections.

Benchmarking

I should be using microbenchmark or a similar package here, but tictoc should give us a good feel for the scale of the differences between lookup methods.

tic("Members lookup dplyr")
members_df %>% 
  filter(member_id == f_of_f_small_sample_df[10000, ]$member_id)
toc()


tic("Members lookup base")
members_df[members_df$member_id == f_of_f_small_sample_df[10000, ]$member_id, ]
toc()

This kind of time could be a problem for a large number of calculations. Let's try using the indexing feature from data.table.

tic("Use data.table")
tic("Members convert to data.table")
members_dt <- data.table::as.data.table(members_df)
toc()
tic("Add index")
data.table::setkey(members_dt, member_id)
toc()
tic("Simple lookup")
members_dt[member_id == f_of_f_small_sample_df[10000, ]$member_id]
toc()
toc()

It seems we might be able to reduce the lookup time by roughly a factor of 2 over base or dplyr; a definitive answer would require proper benchmarking over a sample of lookups.
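
As a sketch of what that benchmarking might look like (assuming the microbenchmark package is installed; target_id is just an illustrative name):

# Hypothetical comparison of the three lookup methods, averaged over
# repeated runs by microbenchmark.
library(microbenchmark)

target_id <- f_of_f_small_sample_df[10000, ]$member_id

microbenchmark(
  dplyr      = members_df %>% filter(member_id == target_id),
  base       = members_df[members_df$member_id == target_id, ],
  data.table = members_dt[member_id == target_id],
  times      = 100
)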

Calculating the Answer

For now we have settled on using an indexed version of the members data frame via data.table. We can use this method to efficiently look up the direct connections of each member and of each friend of a friend.

tic("Count common connections per relationship")
sum(members_dt[member_id == f_of_f_small_sample_df[10000, ]$member_id]$connected_member_id %in% members_dt[member_id == f_of_f_small_sample_df[10000, ]$friend_of_friend]$connected_member_id,
    na.rm = TRUE)
toc()

So one pair of lookups plus the common-connections count on the full indexed data.table took about 0.04 seconds. At that rate, the small sample alone (about 260k friend-of-a-friend relationships from the 100k sample) works out at roughly 3 hours of calculation. Two lookups on an indexed table should scale as roughly O(log(n)), but this still leaves us in trouble: the solution could balloon in time, and possibly memory, when we bring in larger samples.

paste((0.04 / 3600) * 260337, "hours")

Industrialising the Solution

f_of_f_small_sample <- ego(members_small_sample_graph, order = 2, mindist = 2, mode = "all")


get_friends_of_friends <- function(igraph_connections, friends_of_friends_vertices) {
  # takes an igraph object of all connections and a list of vertices of all friends of friends
  # calculated by igraph::ego(igraph_obj, order = 2, mindist = 2, mode = "all")
  friends_of_friends_df <- 
    tibble(member_id = V(igraph_connections)$name,
           friend_of_friend = map(friends_of_friends_vertices, function(x) x$name))

  friends_of_friends_df <- unnest(friends_of_friends_df, friend_of_friend)

  friends_of_friends_df %>% 
    # Join each sorted pair with a separator so that distinct pairs cannot
    # collapse to the same key.
    mutate(sorted_set = map2_chr(member_id, friend_of_friend, 
                                 function(x, y) paste(sort(c(x, y)), collapse = "_"))) %>% 
    distinct(sorted_set, .keep_all = TRUE) %>% 
    select(-sorted_set) %>% 
    mutate(across(everything(), as.integer))
}

f_of_f_small_sample_df <- 
  get_friends_of_friends(members_small_sample_graph, f_of_f_small_sample)
f_of_f_small_sample_df

tic("Sample full calculation")
f_of_f_small_sample_df <-
  f_of_f_small_sample_df %>% 
  mutate(count_common_connections = 
           map2_int(member_id, friend_of_friend, calc_num_common_conns, members_dt))
toc()

calc_num_common_conns <- function(member, f_of_f, member_conns) {
  sum(member_conns[member_id == member]$connected_member_id %in%
        member_conns[member_id == f_of_f]$connected_member_id,
    na.rm = TRUE)
}
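
Step 5 of Algorithm 4 is then straightforward; a minimal sketch, assuming the mutate above has completed:

# Retrieve the friend-of-a-friend pair with the most common connections.
f_of_f_small_sample_df %>% 
  arrange(desc(count_common_connections)) %>% 
  head(1)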

